library(tidyverse)
library(rmarkdown)
library(lubridate)
library(ggrepel)
states <- map_data("state")
state_names <- states %>% group_by(region) %>% summarize(long=mean(long), lat=mean(lat))
acc <- read.csv("https://raw.githubusercontent.com/xdaiISU/ds202materials/master/hwlabs/fars2017/accident.csv", stringsAsFactors = FALSE)
ppl <- read.csv("https://raw.githubusercontent.com/xdaiISU/ds202materials/master/hwlabs/fars2017/person.csv", stringsAsFactors = FALSE)
glc <- readxl::read_xlsx("FRPP_GLC.xlsx")

Preprocessing

# Preprocess GLC
glc$STATE = as.numeric(glc$`State Code`)

glc_states <- glc %>% group_by(STATE) %>% summarize(state_name=first(`State Name`))
acc_with_state_names <- acc %>% left_join(glc_states, by=c('STATE'))

1

acc %>% group_by(DAY_WEEK) %>%
  summarize(number_accidents=n()) %>%
  mutate(weekday=c('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')) %>%
  select(-DAY_WEEK) %>%
  subset(select=c(2,1))
## # A tibble: 7 x 2
##   weekday number_accidents
##   <chr>              <int>
## 1 Sun                 5360
## 2 Mon                 4374
## 3 Tue                 4347
## 4 Wed                 4314
## 5 Thu                 4621
## 6 Fri                 5358
## 7 Sat                 5873

Using the data manual, we know that values of DAY_WEEK 1-7 corresponds to days, Sun-Sat, which we can use to get the weekday label for each integer representation. In the resulting table, we can see that on Fridays, Saturdays, and Sundays, there are generally a significantly greater number of fatal accidents compared to Monday, Tuesday, Wednesday, or Thursday.

The greatest number of fatal accidents occur on Saturdays.

2

fatal <- ppl %>% filter(INJ_SEV==4)
head(fatal)
##   STATE ST_CASE VE_FORMS VEH_NO PER_NO STR_VEH COUNTY DAY MONTH HOUR MINUTE
## 1     1   10001        1      1      1       0     73  19     2   23     35
## 2     1   10002        1      1      1       0     89  14     2   14     59
## 3     1   10003        3      2      1       0    101  31     1   20     31
## 4     1   10004        1      1      1       0     73   1     1   16     55
## 5     1   10005        1      1      1       0     13   1     1   20      0
## 6     1   10006        2      1      1       0     49   6     1   18     40
##   RUR_URB FUNC_SYS HARM_EV MAN_COLL SCH_BUS MAKE MAK_MOD BODY_TYP MOD_YEAR
## 1       2        1      38        0       0   20   20421       15     2004
## 2       2        1       1        0       0   37   37402       14     2005
## 3       2        1      12        1       0    2    2404       14     2003
## 4       2        4      30        0       0   30   30046        4     2014
## 5       1        1      35        0       0   12   12471       34     2010
## 6       1        1      34        0       0   49   49032        4     1997
##   TOW_VEH SPEC_USE EMER_USE ROLLOVER IMPACT1 FIRE_EXP AGE SEX PER_TYP INJ_SEV
## 1       1        0        0        0      12        0  42   1       1       4
## 2       0        0        0        9       0        0  43   1       1       4
## 3       0        0        0        0       6        0  47   1       1       4
## 4       0        0        0        1      11        0  18   1       1       4
## 5       0        0        0        1      12        0  67   2       1       4
## 6       0        0        0        1      11        0  18   2       1       4
##   SEAT_POS REST_USE REST_MIS AIR_BAG EJECTION EJ_PATH EXTRICAT DRINKING ALC_DET
## 1       11       20        0       1        1       9        0        0       9
## 2       11        3        0       8        0       0        0        9       9
## 3       11        3        0      99        0       0        0        0       9
## 4       11        3        0       8        0       0        0        9       9
## 5       11        3        0       1        0       0        0        0       9
## 6       11       20        0      20        1       9        0        0       9
##   ALC_STATUS ATST_TYP ALC_RES DRUGS DRUG_DET DSTATUS DRUGTST1 DRUGTST2 DRUGTST3
## 1          0        0     996     0        8       0        0        0        0
## 2          2        1       0     9        8       2        1        1        1
## 3          2        1       0     0        8       2        1        0        0
## 4          0        0     996     9        8       0        0        0        0
## 5          0        0     996     0        8       0        0        0        0
## 6          0        0     996     0        8       0        0        0        0
##   DRUGRES1 DRUGRES2 DRUGRES3 HOSPITAL DOA DEATH_DA DEATH_MO DEATH_YR DEATH_HR
## 1        0        0        0        0   7       19        2     2017       23
## 2      605      600      996        0   7       14        2     2017       14
## 3        1        0        0        0   7       31        1     2017       20
## 4        0        0        0        0   7        1        1     2017       16
## 5        0        0        0        0   7        1        1     2017       20
## 6        0        0        0        0   7        6        1     2017       18
##   DEATH_MN DEATH_TM LAG_HRS LAG_MINS P_SF1 P_SF2 P_SF3 WORK_INJ HISPANIC RACE
## 1       35     2335       0        0     0     0     0        0        7    1
## 2       59     1459       0        0     0     0     0        0        7    1
## 3       31     2031       0        0     0     0     0        0        7    2
## 4       55     1655       0        0     0     0     0        0        7    2
## 5        0     2000       0        0     0     0     0        0        7    1
## 6       40     1840       0        0     0     0     0        0        7    1
##   LOCATION
## 1        0
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0

Here we see a few entries of the dataset created which contains only the people who suffered fatal injuries.

3

# First get the MAKE for each accident
dat <- acc_with_state_names %>% inner_join(ppl, by=c('ST_CASE', 'STATE')) %>% select(STATE, state_name, MAKE)

most_dangerous <- dat %>%
  filter(!is.na(MAKE)) %>%
  group_by(STATE, state_name, MAKE) %>%
  summarize(number_accidents=n()) %>%
  filter(number_accidents==max(number_accidents))
## `summarise()` has grouped output by 'STATE', 'state_name'. You can override using the `.groups` argument.
most_dangerous %>% paged_table

Here we can see the most dangerous make ID for each state ID, as well as the number of fatal accidents for each of those state/make combinations.

4

states$region = toupper(states$region)

avg_state_locations <- states %>% group_by(region) %>% summarize(avg_long=mean(long), avg_lat=mean(lat))

dangerous_with_location <- most_dangerous %>% mutate(region=state_name) %>% inner_join(avg_state_locations, by=c('region'))

ggplot(states, aes(x=long, y=lat)) + geom_polygon(aes(group=group)) +
  geom_text_repel(data=dangerous_with_location, aes(x=avg_long, y=avg_lat, label=MAKE), color='green')

Here we can see the most dangerous vehicle (most fatal accidents) make code for each state.

5

acc_ppl <- acc %>% inner_join(ppl)
## Joining, by = c("STATE", "ST_CASE", "VE_FORMS", "COUNTY", "DAY", "MONTH", "HOUR", "MINUTE", "RUR_URB", "FUNC_SYS", "HARM_EV", "MAN_COLL", "SCH_BUS")
paged_table(acc_ppl)

Show above are a few entries of the joined dataframe. ST_CASE identifies each accident, so we could join by that variable, but we can also just allow R to match by all matching columns to get an equivalent result.

6

acc %>% group_by(DAY_WEEK) %>%
  summarize(number_accidents=n()) %>%
  mutate(weekday=c('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')) %>%
  select(-DAY_WEEK) %>%
  subset(select=c(2,1)) %>%
  ggplot(aes(x=weekday, weight=number_accidents)) + geom_bar() +
  xlab('Day of Week') + ylab('Number of Accidents')

acc %>% group_by(HOUR) %>%
  filter(HOUR != 99) %>%
  summarize(number_accidents=n()) %>%
  ggplot(aes(x=HOUR, weight=number_accidents)) + geom_bar() +
  xlab('Hour in Day') + ylab('Number of Accidents')

acc_ppl %>% group_by(SEX) %>%
  summarize(number_accidents=n()) %>%
  ggplot(aes(x=SEX, weight=number_accidents)) + geom_bar() +
  xlab('Sex Attribute Code') + ylab('Number of Accidents')

We can see from these summaries that Saturday is the most common day on which fatal accidents occur, fatal accidents are most common around hour 18 (6 PM), and that males (Sex Attribute 1) are about twice as likely to be in an accident that females (Sex Attribute 2).

7

counties <- map_data('county')

deaths_by_county <- acc_ppl %>% 
  filter(INJ_SEV==4) %>% 
  group_by(STATE, COUNTY) %>%
  summarize(total_deaths=n())
## `summarise()` has grouped output by 'STATE'. You can override using the `.groups` argument.
glc_counties <- glc %>%
  mutate(STATE=as.numeric(`State Code`), COUNTY=as.numeric(`County Code`), region=tolower(`State Name`), subregion=tolower(`County Name`)) %>%
  group_by(region, subregion, STATE, COUNTY) %>%
  summarize(region=first(region), subregion=first(subregion), STATE=first(STATE), COUNTY=first(COUNTY))
## `summarise()` has grouped output by 'region', 'subregion', 'STATE'. You can override using the `.groups` argument.
#glc_loc %>% glc_counties %>% left_join()
#glc_counties
#deaths_by_county
#counties

deaths_by_county_with_names <- deaths_by_county %>% left_join(glc_counties, by=c('STATE', 'COUNTY'))
deaths_by_county_with_names
## # A tibble: 2,843 x 5
## # Groups:   STATE [51]
##    STATE COUNTY total_deaths region  subregion
##    <dbl>  <dbl>        <int> <chr>   <chr>    
##  1     1      1           14 alabama autauga  
##  2     1      3           26 alabama baldwin  
##  3     1      5            6 alabama barbour  
##  4     1      7            8 alabama bibb     
##  5     1      9           23 alabama blount   
##  6     1     11            6 alabama bullock  
##  7     1     13           14 alabama butler   
##  8     1     15           21 alabama calhoun  
##  9     1     17            5 alabama chambers 
## 10     1     19           10 alabama cherokee 
## # ... with 2,833 more rows
#counties

dat <- counties %>% inner_join(deaths_by_county_with_names, by=c('region', 'subregion'))

#counties_with_codes <- counties %>% inner_join(glc_counties, by=c('region', 'subregion'))
#counties_with_codes
#dat
ggplot(dat, aes(x=long, y=lat), fill=total_deaths) + geom_polygon(aes(group=group))